Arviz bandwidth

Here we include some examples and notes to understand how current arviz kde/bandwidth works. The notes are at the bottom of the document.

In [1]:
import numpy as np
from scipy import stats
from arviz_kde import fast_kde
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

import sys
sys.path.append("../")
import src
from src.bandwidth import bw_scott
# Large default canvas: each comparison figure holds seven subplots.
plt.rcParams["figure.figsize"] = (18, 14)

# Colors used throughout: BLUE for src (Scott's rule), GREEN for arviz fast_kde.
BLUE = "#3498db"
GREEN = "#27ae60"
# Fix the RNG so reruns of the notebook reproduce the same figures.
np.random.seed(1995)
In [2]:
def bw_az(x, n_points=200):
    """Reproduce arviz's KDE bandwidth rule for a 1-D sample.

    The rule is bw = 4.5 * log(N) * N**-0.2, expressed in *bin units*;
    it is converted to data units here by multiplying by the bin width of
    the grid arviz would build, so it can be compared against Scott's rule.

    Parameters
    ----------
    x : np.ndarray
        1-D sample.
    n_points : int, optional
        Maximum number of grid bins (arviz default is 200).

    Returns
    -------
    float
        Bandwidth in the same (data) scale as Scott's rule.
    """
    bw = 4.5
    x_len = len(x)
    x_log_len = np.log(x_len) * bw

    # Amount of bins is usually `n_points` (200 by default),
    # but there is also a heuristic that can pick fewer bins.
    n_bins = min(int(x_len ** (1 / 3) * x_log_len * 2), n_points)
    x_grid = np.linspace(x.min(), x.max(), num=n_bins)
    # Scale the bin-unit bandwidth by the grid spacing.
    return (x_log_len * x_len ** -0.2) * (x_grid[1] - x_grid[0])
In [3]:
# Map distribution names (as used in pdf_kwargs entries below) to their
# scipy.stats distribution objects.
distributions_dict = {
    "norm": stats.norm,
    "gamma": stats.gamma,
    "lognorm": stats.lognorm,
    "beta": stats.beta,
    "t": stats.t,
    "vonmises": stats.vonmises
}

# Test densities for the bandwidth comparison. Each entry describes a mixture:
#   "distribution": component names (keys of distributions_dict)
#   "params":       per-component scipy.stats parameters
#   "wts":          mixture weights (None means equal weights)
#   "bc":           whether the support has a hard bound, i.e. whether the
#                   KDE should apply boundary correction
pdf_kwargs = {
    "gaussian_1": {
        "distribution": ["norm"],
        "params": [{"loc": 0, "scale": 1}],
        "wts": None,
        "bc": False,
    },
    "gmixture_1": {
        "distribution": ["norm", "norm"],
        "params": [{"loc": 0, "scale": 0.8}, {"loc": 8, "scale": 1}],
        "wts": None,
        "bc": False,
    },
    "gmixture_2": {
        "distribution": ["norm", "norm"],
        "params": [{"loc": 0, "scale": 0.15}, {"loc": 5, "scale": 1.2}],
        "wts": None,
        "bc": False,
    },
    "gamma_1": {
        "distribution": ["gamma"],
        "params": [{"a": 1, "scale": 1}],
        "wts": None,
        "bc": True,
    },
    "gamma_2": {
        "distribution": ["gamma"],
        "params": [{"a": 2, "scale": 1}],
        "wts": None,
        "bc": True,
    },
    "beta_1": {
        "distribution": ["beta"],
        "params": [{"a": 2.5, "b": 1.5}],
        "wts": None,
        "bc": True,
    },
    "logn_1": {
        "distribution": ["lognorm"],
        "params": [{"s": 1, "scale": 1}],
        "wts": None,
        "bc": True,
    },
    "logn_2": {
        "distribution": ["lognorm"],
        "params": [{"s": 0.7, "scale": 2}],
        "wts": None,
        "bc": True,
    },
    "t_1": {
        "distribution": ["t"],
        "params": [{"df": 2}],
        "wts": None,
        "bc": False,
    },
    # Skewed three-component mixture with unequal weights.
    "skwd_mixture1": {
        "distribution": ["gamma", "norm", "norm"],
        "params": [{"a": 1.5, "scale": 1}, {"loc": 5, "scale": 1}, {"loc": 8, "scale": 0.75}],
        "wts": [0.7, 0.2, 0.1],
        "bc": True,
    },
    # Circular distribution centred at pi (samples are re-wrapped to (-pi, pi]).
    "vonmises": {
        "distribution": ["vonmises"],
        "params": [{"kappa": 2, "loc": np.pi}],
        "wts": None,
        "bc": True,
    },
}

def component_rvs(distribution, params, size):
    """Draw `size` samples from one named component distribution.

    Von Mises samples centred at pi are wrapped back into (-pi, pi] so the
    sample lives on the same interval as the plotting grid.
    """
    frozen = distributions_dict[distribution](**params)
    samples = frozen.rvs(size)
    if distribution == "vonmises" and params["loc"] == np.pi:
        above = samples > np.pi
        samples[above] -= 2 * np.pi
    return samples

def mixture_rvs(distributions, params, size, wts=None):
    """Draw `size` samples from a mixture of named components.

    Parameters
    ----------
    distributions : list of str
        Component names (keys of distributions_dict).
    params : list of dict
        scipy.stats parameters for each component.
    size : int
        Total number of samples to draw.
    wts : list of float, optional
        Mixture weights summing to 1; None means equal weights.
    """
    if wts is None:
        wts = np.repeat(1 / len(distributions), len(distributions))
    assert len(distributions) == len(params) == len(wts)
    assert np.allclose(np.sum(wts), 1)
    sizes = np.round(np.array(wts) * size).astype(int)
    # Rounding each component independently can leave the total a few samples
    # off `size` (e.g. three equal weights with size=100 gives 33+33+33=99);
    # absorb the remainder into the largest component.
    sizes[np.argmax(sizes)] += size - sizes.sum()
    samples = [component_rvs(d, p, s) for d, p, s in zip(distributions, params, sizes)]
    return np.concatenate(samples)

def distribution_bounds(distribution, params):
    """Return [low, high] plotting bounds for one named distribution.

    Bounded supports (gamma, lognorm, beta, vonmises) use their hard bounds;
    unbounded ones use wide quantiles / standard-deviation multiples.
    Raises KeyError for an unknown distribution name.
    """
    if distribution == "norm":
        half_width = 3.5 * params["scale"]
        return [params["loc"] - half_width, params["loc"] + half_width]
    if distribution == "gamma":
        return [0, stats.gamma.ppf(0.995, a=params["a"], scale=params["scale"])]
    if distribution == "lognorm":
        return [0, stats.lognorm.ppf(0.995, s=params["s"], scale=params["scale"])]
    if distribution == "beta":
        return [0, 1]
    if distribution == "t":
        return [stats.t.ppf(0.01, df=params["df"]), stats.t.ppf(0.99, df=params["df"])]
    if distribution == "vonmises":
        return [-np.pi, np.pi]
    raise KeyError(distribution)

def mixture_bounds(distributions, params):
    """Overall [min, max] across the per-component plotting bounds."""
    per_component = [distribution_bounds(d, p) for d, p in zip(distributions, params)]
    flat = np.concatenate(per_component)
    return [np.min(flat), np.max(flat)]

def mixture_grid(distributions, params):
    """Evenly spaced 500-point grid spanning the mixture's plotting bounds."""
    lower, upper = mixture_bounds(distributions, params)
    return np.linspace(lower, upper, num=500)

def component_pdf(distribution, params, grid):
    """Evaluate one named component's pdf on `grid`."""
    frozen = distributions_dict[distribution](**params)
    return frozen.pdf(grid)

def mixture_pdf(distributions, params, wts=None, grid=None, return_grid=False):
    """Evaluate the true mixture pdf on a grid.

    When `grid` is None a default grid spanning the mixture bounds is built.
    Returns the pdf, or (grid, pdf) when `return_grid` is True.
    """
    if wts is None:
        wts = np.repeat(1 / len(distributions), len(distributions))
    assert len(distributions) == len(params) == len(wts)
    assert np.allclose(np.sum(wts), 1)

    if grid is None:
        grid = mixture_grid(distributions, params)

    component_pdfs = [component_pdf(d, p, grid) for d, p in zip(distributions, params)]
    pdf = np.average(component_pdfs, axis=0, weights=wts)
    return (grid, pdf) if return_grid else pdf
In [4]:
def compare_bw(dist, sizes, niter=50):
    """Compare src's Scott-rule KDE against arviz's fast_kde for one test density.

    Draws `niter` repeated samples at sizes 100, 1000 and 5000, overlaying each
    density estimate (row 1: src/Scott, row 2: arviz) on the true pdf, and then
    plots the two bandwidth rules as a function of sample size in a bottom panel.

    Parameters
    ----------
    dist : str
        Key of pdf_kwargs selecting the test density.
    sizes : list of int
        Sample sizes for the bandwidth-vs-size panel.
    niter : int, optional
        Number of repeated samples per size.
    """
    kwargs = pdf_kwargs[dist]
    distribution = kwargs["distribution"]
    params = kwargs["params"]
    wts = kwargs["wts"]
    bc = kwargs["bc"]

    grid, true_pdf = mixture_pdf(distribution, params, wts, return_grid=True)

    f = plt.figure(constrained_layout=True)
    gs = f.add_gridspec(3, 3)
    # Row 0: src (Scott's rule); row 1: arviz fast_kde; all share the anchor's axes.
    anchor = f.add_subplot(gs[0, 0])
    scott_axs = [anchor] + [
        f.add_subplot(gs[0, col], sharex=anchor, sharey=anchor) for col in (1, 2)
    ]
    az_axs = [
        f.add_subplot(gs[1, col], sharex=anchor, sharey=anchor) for col in (0, 1, 2)
    ]
    bw_ax = f.add_subplot(gs[2, :])

    size_subset = [100, 1000, 5000]
    for _ in range(niter):
        # Draw sizes in increasing order within each iteration (keeps the RNG
        # stream identical to drawing 100, 1000, 5000 one after another).
        for size, ax_scott, ax_az in zip(size_subset, scott_axs, az_axs):
            rvs = mixture_rvs(distribution, params, size, wts)
            x1, y1 = src.estimate_density(rvs, bw="scott", extend=False, bound_correction=bc)
            y2, xmin, xmax = fast_kde(rvs)
            x2 = np.linspace(xmin, xmax, len(y2))
            ax_scott.plot(x1, y1, lw=3, c=BLUE, alpha=0.35)
            ax_az.plot(x2, y2, lw=3, c=GREEN, alpha=0.35)

    # Overlay the true pdf and label every density panel.
    for size, ax_scott, ax_az in zip(size_subset, scott_axs, az_axs):
        for ax in (ax_scott, ax_az):
            ax.plot(grid, true_pdf, lw=5, ls="--", c="black")
            ax.set_xlim(grid.min(), grid.max())
            ax.set_title(f"Size = {size}")

    # Bottom panel: the two bandwidth rules as a function of sample size.
    for _ in range(niter):
        rvs_list = [mixture_rvs(distribution, params, size, wts) for size in sizes]
        scott_vals = [bw_scott(rvs) for rvs in rvs_list]
        az_vals = [bw_az(rvs) for rvs in rvs_list]

        bw_ax.plot(sizes, scott_vals, lw=2, c=BLUE, alpha=0.35)
        bw_ax.plot(sizes, az_vals, lw=2, c=GREEN, alpha=0.35)

    bw_ax.set_xticks(sizes)
    bw_ax.set_xlabel("Size")
    bw_ax.set_ylabel("Bandwidth")

    legend_lines = [Line2D([0], [0], color=c, linewidth=3) for c in (BLUE, GREEN)]
    bw_ax.legend(legend_lines, ["Scott's rule", "Arviz rule"])
# Each cell below renders the full comparison figure for one test density.
In [5]:
sizes = [100, 200, 500, 1000, 2000, 5000]
compare_bw("gaussian_1", sizes, 40)
In [6]:
compare_bw("gmixture_1", sizes, 40)
In [7]:
compare_bw("gmixture_2", sizes, 40)
In [8]:
compare_bw("logn_1", sizes, 40)
In [9]:
compare_bw("logn_2", sizes, 40)
In [10]:
compare_bw("gamma_1", sizes, 40)
In [11]:
compare_bw("gamma_2", sizes, 40)
In [12]:
compare_bw("beta_1", sizes, 40)
In [13]:
compare_bw("t_1", sizes, 40)
In [14]:
compare_bw("skwd_mixture1", sizes, 40)
In [15]:
compare_bw("vonmises", sizes, 40)

Notes:

Here we don't try to demonstrate any superiority of Scott's rule over Arviz bandwidth. Here we try to study Arviz bandwidth rule and use a well-known rule such as Scott's only for comparison purposes.

Arviz bandwidth is computed as $\text{bw} = 4.5 \log(N) N ^{-0.2}$. This computation is not on the same scale as, for example, Scott's bandwidth, so we can't compare them directly. It is on the scale of the scaled bandwidth one uses when using binning and convolution to produce the Gaussian KDE.

If we want to put Arviz bandwidth on the same scale as other rules/methods, we need to multiply its result by the bin width. In other words, Arviz bandwidth directly depends on the spacing of the grid used to compute the density estimate.

Next, some notes:

  • Arviz bandwidth is usually too small when the sample size is low.
  • Sometimes, there is a sample size where Scott's rule and Arviz bandwidth are very close. But they do not tend to converge to a similar value in the limit.
  • These bandwidths tend to differ in the limit. But it is important to note that when the sample size is very large, the apparently very different bandwidths can result in similar density estimates due to the effect of having so many data points (you can think that as the sample size gets larger, there is more room for using a non-optimal bandwidth and still have pretty good results).
  • Since arviz bandwidth depends on the bin width, distributions with long tails are associated with higher bandwidth estimates, resulting in oversmoothing even with large sample sizes. See the log-normal and t distribution examples.
  • For the arviz rule, it is not true that more grid points result in a more precise estimation, as one would expect for any procedure that approximates a continuous space with a discrete one. For example, if we had used more than the default 200 points for the beta distribution, the estimation would have been even more wiggly.
  • The variability of the arviz bandwidth is higher.
  • One can also see that the 256 points used for the t-distribution are not enough when using Scott's rule.